Data Loading

Load required Libraries

# Clear the workspace before loading anything.
# NOTE(review): rm(list = ls()) in a script is an anti-pattern -- it wipes the
# user's workspace but does not reset the session (attached packages and
# options persist); consider relying on a fresh R session instead.
rm(list=ls())
library(ggplot2)       # plotting
library(dplyr)         # data manipulation; also provides src_mysql/tbl here
library(tidyr)         # reshaping (gather)
library(RMySQL)        # MySQL driver backing the Wordbank connection
library(stringr)       # string cleanup (str_replace)
library(magrittr)      # pipe operators
library(pcaPP)         # cor.fk, used in the commented-out analyses below
library(directlabels)  # geom_dl for direct labels on the composition plot
library(proto)         # presumably a directlabels dependency -- TODO confirm

Load in Wordbank data.

## OPEN DATABASE CONNECTION ##
# Connect to the public Wordbank MySQL database.
# NOTE(review): host and read-only credentials are hard-coded; acceptable for
# a public research database, but consider a config file if this changes.
wordbank <- src_mysql(dbname='wordbank',host="54.149.39.46",
                      user="wordbank",password="wordbank")

## NOW LOAD TABLES ##
# These are lazy references to the remote tables; rows are presumably only
# fetched when a tbl is collected (e.g. via as.data.frame) further below.
source.table <- tbl(wordbank,"common_source")
admin.table <- tbl(wordbank,"common_administration")
child.table <- tbl(wordbank,"common_child")
wordmapping.table <- tbl(wordbank,"common_wordmapping")
instruments.table <- tbl(wordbank,"common_instrumentsmap")
# One raw-response table per language's Words & Sentences (WS) form.
english.ws.table <- tbl(wordbank,"instruments_english_ws")
spanish.ws.table <- tbl(wordbank,"instruments_spanish_ws")
norwegian.ws.table <- tbl(wordbank,"instruments_norwegian_ws")
danish.ws.table <- tbl(wordbank,"instruments_danish_ws")

Get kid data and put it together.

# Administration records: one row per completed CDI form.
admins <- as.data.frame(
  admin.table %>%
    select(data_id, child_id, age, source_id) %>%
    rename(id = data_id, child.id = child_id, source.id = source_id)
)

# Per-child demographic variables, keyed to join onto the administrations.
demos <- as.data.frame(
  child.table %>%
    select(id, sex, mom_ed, birth_order) %>%
    rename(child.id = id)
)

# One row per administration, with the child's demographics attached.
child.data <- left_join(admins, demos) %>% as.tbl()

Set up mappings and instruments.

# Pull the word-to-item mapping and the instrument metadata into local data
# frames, then attach instrument info to every item.
mapping <- wordmapping.table %>% as.data.frame()
instruments <- instruments.table %>%
  as.data.frame() %>%
  rename(instrument_id = id)
items <- mapping %>% left_join(instruments)

Function for getting all of the data in Wordbank for a given language (kid x item).

# Build the kid x item long-form data for one language's Words & Sentences
# (WS) form.
#
# Args:
#   lang.table: remote tbl of raw instrument responses (one row per
#     administration, one column per item).
#   lang.items: item metadata for all languages (the `items` data frame).
#   lang: language name to select from lang.items (e.g. "English").
#   child.data: per-administration child info to join on.
#
# Returns: a long data frame with one row per (administration, item),
#   carrying item metadata and child demographics.
get.language.data <- function(lang.table, lang.items, lang, child.data) {

  # Item metadata for this language's WS form.
  instrument.items <- lang.items %>%
    filter(language == lang, form == 'WS') %>%
    select(item, type, category, lexical_category) %>%
    mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies

  # Raw responses, reshaped to one row per (administration, item).
  instrument.data <- as.data.frame(lang.table) %>%
    rename(id = basetable_ptr_id) %>% # Rename the id
    gather(item, value, -id) %>% # Arrange in longform
    mutate(item = str_replace(item, "item_", "")) # Strip off item_ prefix

  # Attach item metadata and child demographics. Return the result as the
  # final expression: the original ended in `d <- left_join(...)`, which
  # returns its value invisibly.
  instrument.data %>%
    left_join(instrument.items) %>%
    left_join(child.data)
}

Get kid x item data for all languages.

# Build kid x item data for each language's WS form.
d.english <- get.language.data(lang.table = english.ws.table,
                               lang.items = items,
                               lang = "English",
                               child.data = child.data)

d.spanish <- get.language.data(lang.table = spanish.ws.table,
                               lang.items = items,
                               lang = "Spanish",
                               child.data = child.data)

d.norwegian <- get.language.data(lang.table = norwegian.ws.table,
                                 lang.items = items,
                                 lang = "Norwegian",
                                 child.data = child.data)

# Norwegian data loads oddly: NAs on word/word_form items actually mean
# "not produced", so recode them as "" (the not-produced value).
# (Idiomatic column-first subset-assignment with <- replaces the original
# chained `d[cond, ]$value = ""` form.)
d.norwegian$value[d.norwegian$type %in% c("word_form", "word") &
                    is.na(d.norwegian$value)] <- ""

d.danish <- get.language.data(lang.table = danish.ws.table,
                              lang.items = items,
                              lang = "Danish",
                              child.data = child.data)

# Danish has the same NA-for-not-produced issue; apply the same recode.
d.danish$value[d.danish$type %in% c("word_form", "word") &
                 is.na(d.danish$value)] <- ""

Function for getting vocab size data.

# Per-administration vocabulary size for one language: the count and the
# proportion of word items marked "produces".
#
# Args:
#   lang.data: kid x item long data from get.language.data().
#
# Returns: one row per (age, id) with vocab.sum and vocab.mean.
language.vocab.sizes <- function(lang.data) {
  lang.data %>%
    filter(type == "word") %>%
    group_by(age, id) %>%
    summarise(vocab.sum = sum(value == "produces", na.rm = TRUE),
              vocab.mean = mean(value == "produces", na.rm = TRUE))
}

Syntax and Morphology Analyses

Function for getting kid x {vocab size, syntax score, morphology score} data.

# Per-kid summary for one language: vocabulary size plus morphology
# (word_form) and syntax (complexity) scores.
#
# Args:
#   lang.data: kid x item long data from get.language.data().
#   lang: language name stamped onto the result.
#
# Returns: one row per administration with vocab, complexity, and wordform
#   measures; administrations with any missing complexity items are dropped.
summarise.language.data <- function(lang.data, lang) {

  d.vocab <- language.vocab.sizes(lang.data)

  # Syntax: proportion of complexity items marked "complex". Also track how
  # many complexity responses were missing so incomplete forms can be
  # filtered out below.
  d.complexity <- lang.data %>%
    filter(type == "complexity") %>%
    group_by(age, id) %>%
    summarise(num.complexity.na = sum(is.na(value)),
              complexity = mean(value == "complex", na.rm = TRUE))

  # Morphology: proportion of word_form items marked "produces".
  d.wordform <- lang.data %>%
    filter(type == "word_form") %>%
    group_by(age, id) %>%
    summarise(wordform = mean(value == "produces", na.rm = TRUE))

# Spanish doesn't have ending data, so it's skipped, at least for now.
#   d.ending <- d %>%
#     filter(type %in% c("ending")) %>%
#     group_by(id) %>%
#     summarise(ending_sometimes = mean(value == "sometimes" |
#                                       value == "often",
#                                       na.rm=TRUE),
#               ending_often = mean(value == "often",
#                                   na.rm=TRUE))
#  d.composite <- left_join(d.composite, d.ending)

  # Join the three measures and keep only fully-answered complexity forms.
  d.composite <- d.vocab %>%
    left_join(d.complexity) %>%
    left_join(d.wordform) %>%
    filter(num.complexity.na == 0) %>%
    select(-num.complexity.na)

  d.composite$language <- lang

  return(d.composite)
}

Get kid x {vocab size, syntax score, morphology score} data for all languages and aggregate them.

# Per-kid {vocab, syntax, morphology} summaries for each language.
summary.english <- summarise.language.data(d.english,"English")
summary.spanish <- summarise.language.data(d.spanish,"Spanish")
summary.norwegian <- summarise.language.data(d.norwegian,"Norwegian")
summary.danish <- summarise.language.data(d.danish,"Danish")

# Stack all languages, keep ages 16-30 months, and bin ages into three
# 5-month groups for plotting.
# NOTE(review): rbind_list() is deprecated in newer dplyr -- bind_rows() is
# the drop-in replacement if the dplyr version is ever upgraded.
summary.data <- rbind_list(summary.english,summary.spanish,
                           summary.norwegian,summary.danish) %>%
  filter(age > 15 & age < 31) %>%
  mutate(age.group = cut(age, breaks = c(15, 20, 25, 30)),
         language = factor(language,
                           levels=c("English", "Spanish", "Norwegian", "Danish")))
# gather for plotting
# NOTE(review): summary.data may still carry grouping from the upstream
# summarise(), in which case scale() here standardizes within groups rather
# than over the whole data set -- confirm this is intended.
ms <- summary.data %>% gather(measure, score, complexity:wordform) %>%
  mutate(measure = factor(measure, levels = c("wordform","complexity"),
                          labels = c("Word Form", "Complexity")),
         s.vocab = scale(vocab.mean),
         s.age = scale(age))

Using Age and Vocab to predict Morphology and Syntax Scores.

#quartz(width=8,height=7.5)
# Score (syntax / morphology) as a function of vocabulary size, colored by
# age group, faceted by language x measure, with a quadratic fit per group.
ggplot(ms,
       aes(x = vocab.mean, y = score, colour = age.group, fill = age.group,
           label = age.group)) +
  geom_jitter(size = .8) +
  geom_smooth(method = "lm", formula = y ~ I(x^2)) +
  facet_grid(language ~ measure) +
  scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, .1),
                     name = "Vocabulary Size") +
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0, 1, .2),
                     name = "Score (Mean Items)") +
  theme_bw(base_size = 14) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")

Using Morphology scores to Predict Syntax scores.

#quartz()
# Syntax (complexity) score as a function of morphology (word form) score,
# colored by age group, faceted by language, with a linear fit per group.
ggplot(summary.data,
       aes(x = wordform, y = complexity, fill = age.group,
           colour = age.group, label = age.group)) +
  facet_wrap(~ language) +
  geom_jitter(size = 1) +
  geom_smooth(method = "lm", formula = y ~ x) +
  scale_x_continuous(limits = c(0, 1.05), breaks = seq(0, 1, .2),
                     name = "Morphology Score") +
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0, 1, .2),
                     name = "Syntax Score") +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw(base_size = 14)


Vocabulary Composition Analysis

Function for computing vocabulary composition for each speaker of a language.

# Per-kid vocabulary composition for one language: for each administration,
# the proportion of produced words falling in each lexical category.
#
# Args:
#   lang.data: kid x item long data from get.language.data().
#   lang: language name stamped onto the result.
#
# Returns: one row per (administration, lexical category) with the
#   proportion (prop) of that kid's produced vocabulary in the category.
vocab.composition <- function(lang.data, lang) {

  d.vocab <- language.vocab.sizes(lang.data)

  # Produced-word counts per lexical category.
  d.cat <- lang.data %>%
    filter(type == "word") %>%
    group_by(id, lexical_category) %>%
    summarise(cat = sum(value == "produces", na.rm = TRUE))

  d.vocab.comp <- left_join(d.vocab, d.cat) %>%
    mutate(prop = cat / vocab.sum) %>%
    select(-cat)
  # Use <- for assignment, consistent with summarise.language.data() above
  # (the original used `=` here).
  d.vocab.comp$language <- lang

  return(d.vocab.comp)
}

Function for computing CDI form composition for all languages.

# CDI form composition per language: the proportion of WS word items on the
# form itself that fall in each lexical category (used as plot baselines).
#
# Args:
#   lang.items: item metadata for all languages (the `items` data frame).
#
# Returns: one row per (language, lexical category) with the item count
#   (num.per.cat), form total (n), and proportion (prop.per.cat).
lang.vocab.composition <- function(lang.items) {

  ws.words <- lang.items %>%
    filter(form == "WS", type == "word")

  # Total number of word items on each language's form.
  totals <- ws.words %>%
    group_by(language) %>%
    summarise(n = n())

  # Items per lexical category, with the per-category proportion.
  ws.words %>%
    group_by(language, lexical_category) %>%
    summarise(num.per.cat = n()) %>%
    left_join(totals) %>%
    mutate(prop.per.cat = num.per.cat / n)
}

Get vocabulary composition data for all languages.

# get form compositions
# Baseline composition of each language's form ("other" excluded).
lang.vocab.comp <- lang.vocab.composition(items) %>%
  filter(lexical_category != "other")

# get data for kids in each language
vocab.comp.english <- vocab.composition(d.english,"English")
vocab.comp.spanish <- vocab.composition(d.spanish,"Spanish")
vocab.comp.norwegian <- vocab.composition(d.norwegian,"Norwegian")
vocab.comp.danish <- vocab.composition(d.danish,"Danish")

# aggregate data for all languages together
# Keep ages 16-30 months, bin into three 5-month age groups, and give the
# language / category factors fixed level orders for consistent plotting.
# NOTE(review): rbind_list() is deprecated in newer dplyr -- bind_rows() is
# the drop-in replacement if the dplyr version is ever upgraded.
summary.vocab.comp <- rbind_list(vocab.comp.english,vocab.comp.spanish,
                                vocab.comp.norwegian,vocab.comp.danish) %>%
  filter(age > 15 & age < 31) %>%
  mutate(age.group = cut(age, breaks = c(15, 20, 25, 30)),
         language = factor(language,
                           levels=c("English", "Spanish", 
                                    "Norwegian", "Danish")),
         lexical_category = factor(lexical_category, 
                                   levels = c("nouns", "predicates", 
                                              "function_words", "other"),
                                   labels = c("Nouns", "Predicates", 
                                              "Function Words", "Other")))

Plot vocabulary composition by language.

# Vocabulary composition by vocabulary size, faceted by language. Dashed
# lines mark each form's own per-category proportions; loess smooths get
# direct labels. Function words are excluded from the plotted points.
ggplot(filter(summary.vocab.comp, lexical_category != "Function Words"),
       aes(x = vocab.mean, y = prop, colour = lexical_category,
           shape = lexical_category, fill = lexical_category,
           label = lexical_category)) +
  geom_point(size = 1, alpha = 0.25) +
  facet_wrap(~ language) +
  # Baselines for each language's form composition.
  geom_hline(data = lang.vocab.comp, aes(yintercept = prop.per.cat),
             linetype = "dashed", color = "grey") +
  geom_smooth(aes(group = lexical_category), method = 'loess', span = 0.5) +
  scale_y_continuous(name = "Proportion of total vocabulary") +
  scale_x_continuous(name = "Vocabulary Size") +
  geom_dl(aes(label = lexical_category), method = list("smart.grid")) +
  theme_bw(base_size = 14) +
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1") +
  theme(axis.text.x = element_text(angle = -40, hjust = 0),
        axis.title.y = element_text(vjust = 0.35),
        axis.title.x = element_text(vjust = -0.5),
        legend.position = "none")


(Old stuff that’s being kept around for possible future use.)

#```{r,fig.width=12,fig.height=7.5}
#Fit regressions to data
#t.lm1 <- lm(score ~ age + measure, data=ms)
#t.lm2 <- lm(score ~ I(vocab^2)*measure + age*measure, data=ms)
#t.lm3 <- lm(score  ~ I(vocab^2)*measure*age.binned, data=ms)
#t.lm4 <- lm(score  ~ I(vocab^2)*measure*age, data=ms)

#ms$predicted <- predict.lm(t.lm3,ms)

# Plot by age
#ggplot(ms,aes(x = vocab, y = score, colour = measure,label=measure))+
#  facet_wrap(~ age)+
#  geom_jitter(size=1)+
#  geom_line(aes(y=predicted),size=.5)+
#  scale_color_brewer(palette="Set1") +
#  scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") + 
#  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") + 
#  theme_bw(base_size = 14) 
#```

#Replot original correlation with fitted model
#```{r,fig.width=12,fig.height=7.5}
#ggplot(ms,aes(x = vocab, y = score, colour = age.binned, fill = age.binned,
#              label = age.binned)) + 
#  geom_jitter(size=1.5)+
#  geom_line(aes(y=predicted),size=1) + 
#  facet_wrap(~measure) + 
#  scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") + 
#  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") + 
#  theme_bw(base_size = 14) +
#  scale_color_brewer(palette="Set1") +
#  scale_fill_brewer(palette="Set1") 
#```

#Compute some descriptives on syntactic items
#```{r,fig.width=7,fig.height=4}
# compute Kendall's tau -- cor.fk is a faster implementation than in stats::cor
#complex.cors <- cor.fk(as.matrix(bykid.syntax.vocab[,8:ncol(bykid.syntax.vocab)])) %>%
#  as.data.frame
#names(complex.cors) <- str_replace(names(complex.cors),"complx","")
#row.names(complex.cors) <- str_replace(row.names(complex.cors),"complx","")


# make a dendrogram of the complex item similarities
#complex.dendro <- as.dendrogram(hclust(dist(complex.cors)))

#plot(complex.dendro)
#```

#Make a confusion matrix
#```{r,fig.width=8,fig.height=5}
# gather the columns for plotting as a confusion matrix
#complex.cors %<>%
#  mutate(prompt = factor(row.names(complex.cors))) %>%
#  gather(response,correlation,"01":"37")

#ggplot(complex.cors, aes(response, prompt)) + 
#  geom_tile(aes(fill = correlation)) +
#  ylim(rev(levels(complex.cors$prompt))) +
#  scale_fill_gradient(low = "white", high = "black",guide=FALSE) +
#  labs(x="Response", y = "Prompt") +
#  theme(legend.position = "none", axis.ticks = element_blank()) +
#  theme_bw(base_size = 16)
#```

#Compute vocab x age interaction terms for each syntactic item
#```{r,fig.height=5,fig.width=8}
# write regression formulas for separately for each item
#formulas <- sapply(names(bykid.syntax.vocab)[8:ncol(bykid.syntax.vocab)],
#                   function(x) paste(x ,"~ I(vocab^2)*age + 0",collapse=""))

# compute interaction terms each item
#interaction.terms <- sapply(formulas, function(x)  
#  summary(glm(as.formula(x),data=bykid.syntax.vocab,
#              family="binomial"))$coefficients[3,3])
#names(interaction.terms) <- 1:37

#rename results to be human-readable
#interaction.terms <- as.data.frame(interaction.terms) %>%
#  mutate(item = 1:37) %>%
#  rename(zscore = interaction.terms) %>%
#  arrange(zscore) %>%
#  mutate(item = factor(item,levels=item))

# plot interaction terms by item
#ggplot(interaction.terms,
#       aes(x=item,y=zscore,fill=1))+
#  geom_bar(stat="identity")+
#  geom_hline(yintercept=mean(interaction.terms$zscore),
#             lty=2)+
#  theme_bw(base_size = 14) + 
#  scale_y_continuous(name="vocabulary size x age z-score",limits=c(0,15),
#                     breaks=seq(0,15,2.5))+
#  scale_x_discrete(name="complexity CDI item")+
#  scale_color_brewer(palette="Set1") + 
#  theme(legend.position="none")
#```  

#Leftover analyses
#```{r}
#summary(lm(syntax ~ I(vocab^2) * age - 1, data=d))
#summary(lm(morpho ~ I(vocab^2) * age - 1, data=d))

#summary(lm(syntax ~ I(d$s.vocab^2) * d$age_bin - 1, data=d))
#summary(lm(morpho ~ I(d$s.vocab^2) * d$age_bin - 1, data=d))
#```